Part 1: Data Scraping

library(rvest)
library(tidyverse)
url <-"html_top100.txt"
college_urls <- url %>%
  read_html() %>%
  html_node("body") %>% html_nodes("ol[class~=bEyEue]") %>% html_nodes("li[id]")%>% html_nodes("h3") %>% 
  html_nodes("a[href]") %>%
  html_attr("href") 
head(college_urls)
## [1] "/best-colleges/princeton-university-2627"                 
## [2] "/best-colleges/harvard-university-2155"                   
## [3] "/best-colleges/columbia-university-2707"                  
## [4] "/best-colleges/massachusetts-institute-of-technology-2178"
## [5] "/best-colleges/university-of-chicago-1774"                
## [6] "/best-colleges/yale-university-1426"
index_num <- 0

college_tab_1 <-  data.frame("URL" = gsub(" ", "", paste("https://www.usnews.com",college_urls, sep = "")), 
"CollegeName"= "", "TuitionFeesThousands" = 0, "RoomBoardThousands" = 0, "TotalEnrollment" = 0, "SchoolType" = "", "YearFounded" = 0, "Setting" = "", "Endowment2017Millions" = 0, "MedianStartingSalaryOfAlumniThousands" = 0, "Selectivity" = "", "Fall2017AcceptanceRate" = 0, "MalePercentage" = 0, "FourYearGraduationRate" = 0, stringsAsFactors = FALSE) 

#removing one college that doesn't have a median starting salary, for data uniformity
college_tab_1 <- college_tab_1[-c(40),]

head(college_tab_1)
##                                                                               URL
## 1                  https://www.usnews.com/best-colleges/princeton-university-2627
## 2                    https://www.usnews.com/best-colleges/harvard-university-2155
## 3                   https://www.usnews.com/best-colleges/columbia-university-2707
## 4 https://www.usnews.com/best-colleges/massachusetts-institute-of-technology-2178
## 5                 https://www.usnews.com/best-colleges/university-of-chicago-1774
## 6                       https://www.usnews.com/best-colleges/yale-university-1426
##   CollegeName TuitionFeesThousands RoomBoardThousands TotalEnrollment
## 1                                0                  0               0
## 2                                0                  0               0
## 3                                0                  0               0
## 4                                0                  0               0
## 5                                0                  0               0
## 6                                0                  0               0
##   SchoolType YearFounded Setting Endowment2017Millions
## 1                      0                             0
## 2                      0                             0
## 3                      0                             0
## 4                      0                             0
## 5                      0                             0
## 6                      0                             0
##   MedianStartingSalaryOfAlumniThousands Selectivity Fall2017AcceptanceRate
## 1                                     0                                  0
## 2                                     0                                  0
## 3                                     0                                  0
## 4                                     0                                  0
## 5                                     0                                  0
## 6                                     0                                  0
##   MalePercentage FourYearGraduationRate
## 1              0                      0
## 2              0                      0
## 3              0                      0
## 4              0                      0
## 5              0                      0
## 6              0                      0

Below are functions used to obtain data from the website and parse it.

#retrieves of vector of size three containing the Tuition&Fees, Room&Board, and total enrollment
get_info <- function(url_html){
  attr <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%   
    html_nodes("section[class~=hero-stats-widget-stats]") %>%
    html_nodes("ul") %>% html_nodes("li") %>% html_nodes("strong")
}

#takes in a vector and index, and parses that information to a double
#ex: $47,263 -> 47263.0
get_tuition_rm <- function(info, num){
  a_1 <- info[num] %>%  html_text()
  tuition_rm <- 
    as.double(paste(substring(a_1, 2, str_locate(a_1, ",")[1] - 1), substring(a_1, str_locate(a_1, ",")[1] + 1, str_locate(a_1, " ")[1] - 1), sep=""))
  tuition_rm / 1000.0
}

#takes in a vector and parses the total enrollment information to a double
get_enrollment <- function(info){
  a_1 <- info[3] %>%  html_text()
  as.double(paste(substring(a_1, 1, str_locate(a_1, ",")[1] - 1), substring(a_1, str_locate(a_1, ",")[1] + 1), sep=""))
}

#gets the percentage of the majority gender at a certain university
get_percent <- function(url_html){
  attr <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%   
    html_nodes("div[class~=block-normal]") %>% html_nodes("span[class~=distribution-breakdown__percentage]") %>% html_text()
  as.double(substring(attr, 1, str_locate(attr, "%")[1] - 1)) / 100.0
}

#retrieves the gender of the majority sex and parses the percentage to be in terms of males
get_gender_ratio <- function(url_html){
  attr <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%   
    html_nodes("div[class~=block-normal]") %>% html_nodes("span[class~=distribution-breakdown__percentage-copy]") %>% html_text()
  attr <- sub("\n                    ","",attr)
  attr <- sub("\n                ","",attr)
  if (attr == "Female"){
    1 - get_percent(url_html)
  }else{
    get_percent(url_html)
  }
}

Here, we use both the functions above and the html_node function to fill out the table.

college_tab <- college_tab_1

for (i in 1:nrow(college_tab)){
  url_html <- college_tab[i,1] %>%read_html()
  college_tab[i,]$CollegeName <- url_html %>% html_node("body") %>% html_nodes("h1[class~=hero-heading]") %>% html_text()
  priv_tuition <- url_html %>% html_node("body") %>% html_nodes("span[data-test-id~=v_private_tuition]") %>% html_text()
  college_tab[i,]$TuitionFeesThousands <- ifelse(length(priv_tuition) > 0, priv_tuition, 
                                                 url_html %>% html_node("body") %>% html_node("span[data-test-id~=v_out_state_tuition]") %>% html_text())
  college_tab[i,]$RoomBoardThousands <- url_html %>% html_node("body") %>% html_node("span[data-test-id~=w_room_board]") %>% html_text()
  college_tab[i,]$TotalEnrollment <- url_html %>% html_node("body") %>% html_node("span[data-test-id~=total_all_students]") %>% html_text()
  college_tab[i,]$MalePercentage <- get_gender_ratio(url_html)
  college_tab[i,]$Fall2017AcceptanceRate <- url_html %>% html_node("span[data-test-id~=r_c_accept_rate]") %>% html_text()
  college_tab[i,]$Selectivity <- url_html %>% html_node("span[data-test-id~=c_select_class]") %>% html_text()
  college_tab[i,]$FourYearGraduationRate <- url_html %>% html_node("span[data-test-id~=grad_rate_4_year]") %>% html_text()
  college_tab[i,]$MedianStartingSalaryOfAlumniThousands <-  url_html %>% html_nodes("div[data-field-id=averageStartSalary]") %>%html_node("span[data-test-id]") %>% html_text()
  temp_vector <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%html_nodes("div[class~=flex-row]") %>%   html_nodes("span[class~=heading-small]") %>% html_text()
  college_tab[i,]$SchoolType <- temp_vector[1]
  college_tab[i,]$YearFounded <- temp_vector[2]
  college_tab[i,]$Setting <- temp_vector[5]
  college_tab[i,]$Endowment2017Millions  <- temp_vector[6]
}

head(college_tab)
##                                                                               URL
## 1                  https://www.usnews.com/best-colleges/princeton-university-2627
## 2                    https://www.usnews.com/best-colleges/harvard-university-2155
## 3                   https://www.usnews.com/best-colleges/columbia-university-2707
## 4 https://www.usnews.com/best-colleges/massachusetts-institute-of-technology-2178
## 5                 https://www.usnews.com/best-colleges/university-of-chicago-1774
## 6                       https://www.usnews.com/best-colleges/yale-university-1426
##                                             CollegeName
## 1                  \n        Princeton University\n    
## 2                    \n        Harvard University\n    
## 3                   \n        Columbia University\n    
## 4 \n        Massachusetts Institute of Technology\n    
## 5                 \n        University of Chicago\n    
## 6                       \n        Yale University\n    
##              TuitionFeesThousands              RoomBoardThousands
## 1 \n            $47,140 (2018-19) \n            $15,610 (2018-19)
## 2 \n            $50,420 (2018-19) \n            $17,160 (2018-19)
## 3 \n            $59,430 (2018-19) \n            $14,016 (2018-19)
## 4 \n            $51,832 (2018-19) \n            $15,510 (2018-19)
## 5 \n            $57,006 (2018-19) \n            $16,350 (2018-19)
## 6 \n            $53,430 (2018-19) \n            $16,000 (2018-19)
##        TotalEnrollment    SchoolType YearFounded  Setting
## 1  \n            8,273 Private, Coed        1746 Suburban
## 2 \n            20,604 Private, Coed        1636    Urban
## 3 \n            25,968 Private, Coed        1754    Urban
## 4 \n            11,466 Private, Coed        1861    Urban
## 5 \n            13,736 Private, Coed        1890    Urban
## 6 \n            12,974 Private, Coed        1701     City
##   Endowment2017Millions MedianStartingSalaryOfAlumniThousands
## 1         $23.4 billion                \n            $68,400*
## 2         $37.1 billion                \n            $66,500*
## 3         $10.0 billion                \n            $64,900*
## 4       $14.8 billion +                \n            $79,800*
## 5        $6.6 billion +                \n            $57,700*
## 6       $27.2 billion +                \n            $63,200*
##                    Selectivity Fall2017AcceptanceRate MalePercentage
## 1 \n            Most selective       \n            6%           0.51
## 2 \n            Most selective       \n            5%           0.52
## 3 \n            Most selective       \n            6%           0.52
## 4 \n            Most selective       \n            7%           0.54
## 5 \n            Most selective       \n            9%           0.51
## 6 \n            Most selective       \n            7%           0.50
##   FourYearGraduationRate
## 1      \n            89%
## 2      \n            84%
## 3      \n            88%
## 4      \n            85%
## 5      \n            88%
## 6      \n            87%

Below, we reformat many of the columns to get usable data.

formatted_college_tab <- college_tab
#fix type of School Type, Setting, Year Founded
formatted_college_tab$SchoolType <- as.factor(formatted_college_tab$SchoolType)
formatted_college_tab$Setting <- as.factor(formatted_college_tab$Setting)
formatted_college_tab$YearFounded <- as.integer(formatted_college_tab$YearFounded)
#fix Endowment2017 formatting
formatted_college_tab$Endowment2017Millions  <- ifelse(grepl("billion", formatted_college_tab$Endowment2017Millions ), sub("\\.","",formatted_college_tab$Endowment2017Millions ),formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions  <-sub(" billion","00",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions  <-sub(" million","",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions  <-sub("[[:punct:]]", "",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions  <-sub("\\$", "",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions  <-sub(" \\+", "",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <- as.double(formatted_college_tab$Endowment2017Millions)
#fix College Name formatting
formatted_college_tab$CollegeName <- sub("^\n        ","",formatted_college_tab$CollegeName)
formatted_college_tab$CollegeName <-sub("\n    ","",formatted_college_tab$CollegeName)
#fixing Acceptance Rate formatting
formatted_college_tab$Fall2017AcceptanceRate <- sub("\n            ","",formatted_college_tab$Fall2017AcceptanceRate)
formatted_college_tab$Fall2017AcceptanceRate <- sub("%","",formatted_college_tab$Fall2017AcceptanceRate)
formatted_college_tab$Fall2017AcceptanceRate <- as.double(formatted_college_tab$Fall2017AcceptanceRate)
formatted_college_tab$Fall2017AcceptanceRate <- formatted_college_tab$Fall2017AcceptanceRate/100
#fixing Grad Rate formatting
formatted_college_tab$FourYearGraduationRate <- sub("\n            ","",formatted_college_tab$FourYearGraduationRate)
formatted_college_tab$FourYearGraduationRate <- sub("%","",formatted_college_tab$FourYearGraduationRate)
formatted_college_tab$FourYearGraduationRate <- as.double(formatted_college_tab$FourYearGraduationRate)
formatted_college_tab$FourYearGraduationRate <- formatted_college_tab$FourYearGraduationRate/100
#fixing Salary formatting
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- 
  sub("\n            ","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- gsub("\\*","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- gsub("\\$","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- gsub("\\,","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- as.double(formatted_college_tab$MedianStartingSalaryOfAlumniThousands)/1000
#fixing Selectivity formatting
formatted_college_tab$Selectivity <- sub("\n            ","",formatted_college_tab$Selectivity)
formatted_college_tab$Selectivity <- as.factor(formatted_college_tab$Selectivity)
#fixing Tuition formatting
formatted_college_tab$TuitionFeesThousands <- sub("\n            ", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands <- sub(" \\(2018-19\\)", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands  <-sub("\\,", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands  <-sub("\\$", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands <- as.double(formatted_college_tab$TuitionFeesThousands)/1000
## Warning: NAs introduced by coercion
#fixing RoomBoard formatting
formatted_college_tab$RoomBoardThousands <- sub("\n            ", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands <- sub(" \\(2018-19\\)", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands  <-sub("\\,", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands  <-sub("\\$", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands <- as.double(formatted_college_tab$RoomBoardThousands)/1000
## Warning: NAs introduced by coercion
#fixing Enrollment formatting
formatted_college_tab$TotalEnrollment <- sub("\n            ", "",formatted_college_tab$TotalEnrollment )
formatted_college_tab$TotalEnrollment  <-sub("\\,", "",formatted_college_tab$TotalEnrollment )
formatted_college_tab$TotalEnrollment <- as.double(formatted_college_tab$TotalEnrollment)


formatted_college_tab <- formatted_college_tab %>% mutate(TotalCostThousands =TuitionFeesThousands + RoomBoardThousands )

formatted_college_tab <- na.omit(formatted_college_tab)
nrow(formatted_college_tab)
## [1] 107
head(formatted_college_tab)
##                                                                               URL
## 1                  https://www.usnews.com/best-colleges/princeton-university-2627
## 2                    https://www.usnews.com/best-colleges/harvard-university-2155
## 3                   https://www.usnews.com/best-colleges/columbia-university-2707
## 4 https://www.usnews.com/best-colleges/massachusetts-institute-of-technology-2178
## 5                 https://www.usnews.com/best-colleges/university-of-chicago-1774
## 6                       https://www.usnews.com/best-colleges/yale-university-1426
##                             CollegeName TuitionFeesThousands
## 1                  Princeton University               47.140
## 2                    Harvard University               50.420
## 3                   Columbia University               59.430
## 4 Massachusetts Institute of Technology               51.832
## 5                 University of Chicago               57.006
## 6                       Yale University               53.430
##   RoomBoardThousands TotalEnrollment    SchoolType YearFounded  Setting
## 1             15.610            8273 Private, Coed        1746 Suburban
## 2             17.160           20604 Private, Coed        1636    Urban
## 3             14.016           25968 Private, Coed        1754    Urban
## 4             15.510           11466 Private, Coed        1861    Urban
## 5             16.350           13736 Private, Coed        1890    Urban
## 6             16.000           12974 Private, Coed        1701     City
##   Endowment2017Millions MedianStartingSalaryOfAlumniThousands
## 1                 23400                                  68.4
## 2                 37100                                  66.5
## 3                 10000                                  64.9
## 4                 14800                                  79.8
## 5                  6600                                  57.7
## 6                 27200                                  63.2
##      Selectivity Fall2017AcceptanceRate MalePercentage
## 1 Most selective                   0.06           0.51
## 2 Most selective                   0.05           0.52
## 3 Most selective                   0.06           0.52
## 4 Most selective                   0.07           0.54
## 5 Most selective                   0.09           0.51
## 6 Most selective                   0.07           0.50
##   FourYearGraduationRate TotalCostThousands
## 1                   0.89             62.750
## 2                   0.84             67.580
## 3                   0.88             73.446
## 4                   0.85             67.342
## 5                   0.88             73.356
## 6                   0.87             69.430

{r} to save as csv to easily work on it without having to reload write.csv(formatted_college_tab, file = "college_info.csv") ``{r} formatted_college_tab <- read.csv(“college_info.csv”) formatted_college_tab <- formatted_college_tab[,-c(1)] formatted_college_tab ```

Part 2: Data Visualization

#Starting Salary
#-histograms
library(ggplot2)
plot_1 <- formatted_college_tab %>%
  ggplot(aes(MedianStartingSalaryOfAlumniThousands)) +
    geom_histogram()+ 
    labs(title="Starting Salary Distribution", x="Median Starting Salary of Alumni (Thousands)", y="Count")
plot_1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of the median starting salary of alumni from all the school seems to be a bell-shaped curve, centering around $55,000.

#Tuition Cost
#-histograms
library(ggplot2)
plot_2 <- formatted_college_tab %>%
  ggplot(aes(TuitionFeesThousands)) +
    geom_histogram()+ 
        labs(title="Tuition Cost Distribution", x="Tuition Cost (Thousands)", y="Count")
plot_2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of tution costs of all the schools is bimodal, with a range of $60,000.

#Acceptance rate vs graduation rate

library(ggplot2)
plot_3 <- formatted_college_tab %>%
  ggplot(aes(x=Fall2017AcceptanceRate, y=FourYearGraduationRate)) +
    geom_point()+ 
    geom_smooth(method=lm)+
        labs(title="Acceptance Vs. Graduation Rate", x="Fall 2017 Acceptance Rate", y="Four Year Graduation Rate")
plot_3

There is a linear relationship between acceptance rate (Fall 2017) and the four year graduation rate. It is an overall negative relationship. The more selective, the lower the rate of graduation.

#Boxplots of (1) gradrate & (2) admission rate by selectivity 

library(ggplot2)
plot_4 <- formatted_college_tab %>%
  ggplot(aes(x=Selectivity, y=FourYearGraduationRate)) +
    geom_boxplot()+
        labs(title="Graduation Rate based on Selectivity", x="Selectivity Level", y="Four Year Graduation Rate")
plot_4

This is significant difference in four year graduation rates based on their Selectivity Level of accepting students. These boxplots show that each 3 selectivity level vary significantly on range and central tendency. The more selective a college is, the greater their graduation rates seem to be.

#Setting vs. room board

library(ggplot2)
plot_5 <- formatted_college_tab %>%
  ggplot(aes(x=Setting, y=RoomBoardThousands)) +
    geom_boxplot()+
        labs(title="Setting vs. Room & Board Costs", x="Setting", y="Room & Board Costs (Thousands)")
plot_5

The boxplots of room & board costs based on setting shows that the setting of the college does influence the room and board costs for the students. The medians vary greatly. The interquartile spreads seems to be similar while the overall ranges vary.

plot_6 <- formatted_college_tab %>%
  ggplot(aes(x=TotalCostThousands, y=MedianStartingSalaryOfAlumniThousands)) +
    geom_point()+ 
    geom_smooth(method=lm)+
        labs(title="Total Cost vs. Median Starting Salary", x="Total Cost (Thousand)", y="Median Starting Salary Of Alumni (Thousands)")
plot_6

There appears to be a positive linear relatinoship between median starting salary and total cost of colleges. The general trends shows that the more students spend on tution and room & board, the more likely that their starting salary is higher.

plot_7 <- formatted_college_tab %>%
  ggplot(aes(x=SchoolType, y=MedianStartingSalaryOfAlumniThousands  
)) +
    geom_boxplot()+
        labs(title="Median Starting Salary Of Alumni Based on School Type  ", x="School Type", y="Median Starting Salary Of Alumni (Thousands)")
plot_7

Between school types, private colleges seem to have greater starting salaries than public schools, based on the medians of these boxplots.

 formatted_college_tab %>% group_by(Selectivity) %>%
  summarise(n())
## # A tibble: 3 x 2
##   Selectivity    `n()`
##   <fct>          <int>
## 1 More selective    61
## 2 Most selective    44
## 3 Selective          2

Part 3: Model Fitting and Selection

Fitting model for tuition prices

#adjusting dataset to remove variables not able to be used in model fitting
college_info <- formatted_college_tab[,-c(1,2)]
head(college_info)
##   TuitionFeesThousands RoomBoardThousands TotalEnrollment    SchoolType
## 1               47.140             15.610            8273 Private, Coed
## 2               50.420             17.160           20604 Private, Coed
## 3               59.430             14.016           25968 Private, Coed
## 4               51.832             15.510           11466 Private, Coed
## 5               57.006             16.350           13736 Private, Coed
## 6               53.430             16.000           12974 Private, Coed
##   YearFounded  Setting Endowment2017Millions
## 1        1746 Suburban                 23400
## 2        1636    Urban                 37100
## 3        1754    Urban                 10000
## 4        1861    Urban                 14800
## 5        1890    Urban                  6600
## 6        1701     City                 27200
##   MedianStartingSalaryOfAlumniThousands    Selectivity
## 1                                  68.4 Most selective
## 2                                  66.5 Most selective
## 3                                  64.9 Most selective
## 4                                  79.8 Most selective
## 5                                  57.7 Most selective
## 6                                  63.2 Most selective
##   Fall2017AcceptanceRate MalePercentage FourYearGraduationRate
## 1                   0.06           0.51                   0.89
## 2                   0.05           0.52                   0.84
## 3                   0.06           0.52                   0.88
## 4                   0.07           0.54                   0.85
## 5                   0.09           0.51                   0.88
## 6                   0.07           0.50                   0.87
##   TotalCostThousands
## 1             62.750
## 2             67.580
## 3             73.446
## 4             67.342
## 5             73.356
## 6             69.430
college_info$FourYearGraduationRate <- college_info$FourYearGraduationRate*100
college_info$MalePercentage <- college_info$MalePercentage*100
college_info$Fall2017AcceptanceRate <- college_info$Fall2017AcceptanceRate*100
#linear model fitting 
tuition_lm_1 <- lm(TuitionFeesThousands~.-RoomBoardThousands-TotalCostThousands, data = college_info)
summary(tuition_lm_1)
## 
## Call:
## lm(formula = TuitionFeesThousands ~ . - RoomBoardThousands - 
##     TotalCostThousands, data = college_info)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.860  -2.743   0.146   3.237  14.500 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                            1.491e+01  2.646e+01   0.563
## TotalEnrollment                       -3.283e-05  5.921e-05  -0.555
## SchoolTypePublic, Coed                -1.150e+01  1.909e+00  -6.021
## YearFounded                            4.876e-03  1.305e-02   0.374
## SettingRural                           2.112e-01  2.878e+00   0.073
## SettingSuburban                        1.191e+00  1.695e+00   0.703
## SettingUrban                           1.695e+00  1.611e+00   1.052
## Endowment2017Millions                 -5.211e-05  1.483e-04  -0.351
## MedianStartingSalaryOfAlumniThousands  1.396e-01  1.705e-01   0.819
## SelectivityMost selective              4.546e+00  2.168e+00   2.096
## SelectivitySelective                  -6.810e+00  4.740e+00  -1.437
## Fall2017AcceptanceRate                 5.185e-02  5.897e-02   0.879
## MalePercentage                         2.208e-02  1.319e-01   0.167
## FourYearGraduationRate                 1.754e-01  4.529e-02   3.873
##                                       Pr(>|t|)    
## (Intercept)                             0.5745    
## TotalEnrollment                         0.5806    
## SchoolTypePublic, Coed                3.42e-08 ***
## YearFounded                             0.7095    
## SettingRural                            0.9417    
## SettingSuburban                         0.4841    
## SettingUrban                            0.2957    
## Endowment2017Millions                   0.7262    
## MedianStartingSalaryOfAlumniThousands   0.4151    
## SelectivityMost selective               0.0388 *  
## SelectivitySelective                    0.1541    
## Fall2017AcceptanceRate                  0.3815    
## MalePercentage                          0.8674    
## FourYearGraduationRate                  0.0002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.182 on 93 degrees of freedom
## Multiple R-squared:  0.7003, Adjusted R-squared:  0.6584 
## F-statistic: 16.72 on 13 and 93 DF,  p-value: < 2.2e-16
plot(tuition_lm_1)

tuition_lm_2 <- step(tuition_lm_1, direction = "both", steps = 1000, trace = F)
summary(tuition_lm_2)
## 
## Call:
## lm(formula = TuitionFeesThousands ~ SchoolType + Selectivity + 
##     FourYearGraduationRate, data = college_info)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.663  -2.870   0.402   3.025  14.015 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                37.98706    2.52629  15.037  < 2e-16 ***
## SchoolTypePublic, Coed    -12.40552    1.29191  -9.602 6.18e-16 ***
## SelectivityMost selective   4.03690    1.42380   2.835 0.005521 ** 
## SelectivitySelective       -7.47358    4.37232  -1.709 0.090437 .  
## FourYearGraduationRate      0.14329    0.03637   3.940 0.000149 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.049 on 102 degrees of freedom
## Multiple R-squared:  0.6854, Adjusted R-squared:  0.6731 
## F-statistic: 55.55 on 4 and 102 DF,  p-value: < 2.2e-16
plot(tuition_lm_2)

anova(tuition_lm_2,tuition_lm_1, test="Chisq")
## Analysis of Variance Table
## 
## Model 1: TuitionFeesThousands ~ SchoolType + Selectivity + FourYearGraduationRate
## Model 2: TuitionFeesThousands ~ (RoomBoardThousands + TotalEnrollment + 
##     SchoolType + YearFounded + Setting + Endowment2017Millions + 
##     MedianStartingSalaryOfAlumniThousands + Selectivity + Fall2017AcceptanceRate + 
##     MalePercentage + FourYearGraduationRate + TotalCostThousands) - 
##     RoomBoardThousands - TotalCostThousands
##   Res.Df    RSS Df Sum of Sq Pr(>Chi)
## 1    102 3731.7                      
## 2     93 3554.7  9    176.99   0.8653

Fitting model for graduation rate

#linear model fitting 
gradrate_lm_1 <- lm(MedianStartingSalaryOfAlumniThousands~.-TuitionFeesThousands-RoomBoardThousands, data = na.omit(college_info))
summary(gradrate_lm_1)
## 
## Call:
## lm(formula = MedianStartingSalaryOfAlumniThousands ~ . - TuitionFeesThousands - 
##     RoomBoardThousands, data = na.omit(college_info))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3062 -2.3889 -0.2445  1.8739 15.6698 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.976e+01  1.595e+01   1.239   0.2186    
## TotalEnrollment            8.453e-06  3.614e-05   0.234   0.8156    
## SchoolTypePublic, Coed    -1.430e+00  1.325e+00  -1.079   0.2833    
## YearFounded                5.017e-03  7.928e-03   0.633   0.5284    
## SettingRural               1.384e+00  1.741e+00   0.795   0.4286    
## SettingSuburban           -2.983e-01  1.032e+00  -0.289   0.7732    
## SettingUrban               3.407e-01  9.871e-01   0.345   0.7308    
## Endowment2017Millions      2.166e-04  8.718e-05   2.485   0.0147 *  
## SelectivityMost selective  2.481e+00  1.326e+00   1.872   0.0644 .  
## SelectivitySelective       2.567e+00  2.885e+00   0.890   0.3759    
## Fall2017AcceptanceRate    -8.235e-02  3.478e-02  -2.368   0.0200 *  
## MalePercentage             5.486e-01  5.637e-02   9.732 7.55e-16 ***
## FourYearGraduationRate     1.843e-02  2.907e-02   0.634   0.5276    
## TotalCostThousands         3.458e-02  5.496e-02   0.629   0.5307    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.752 on 93 degrees of freedom
## Multiple R-squared:  0.7248, Adjusted R-squared:  0.6864 
## F-statistic: 18.85 on 13 and 93 DF,  p-value: < 2.2e-16
plot(gradrate_lm_1)

gradrate_lm_2 <- step(gradrate_lm_1, direction = "both", steps = 1000, trace = F)
summary(gradrate_lm_2)
## 
## Call:
## lm(formula = MedianStartingSalaryOfAlumniThousands ~ SchoolType + 
##     Endowment2017Millions + Selectivity + Fall2017AcceptanceRate + 
##     MalePercentage, data = na.omit(college_info))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5908 -2.3830 -0.3075  1.8991 15.2461 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.341e+01  2.874e+00  11.625  < 2e-16 ***
## SchoolTypePublic, Coed    -1.912e+00  8.010e-01  -2.386  0.01889 *  
## Endowment2017Millions      1.998e-04  7.306e-05   2.735  0.00738 ** 
## SelectivityMost selective  2.697e+00  1.244e+00   2.168  0.03252 *  
## SelectivitySelective       2.042e+00  2.713e+00   0.752  0.45357    
## Fall2017AcceptanceRate    -9.100e-02  3.173e-02  -2.868  0.00504 ** 
## MalePercentage             5.428e-01  4.828e-02  11.243  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.672 on 100 degrees of freedom
## Multiple R-squared:  0.7166, Adjusted R-squared:  0.6996 
## F-statistic: 42.15 on 6 and 100 DF,  p-value: < 2.2e-16
plot(gradrate_lm_2)

anova(gradrate_lm_2,gradrate_lm_1, test="Chisq")
## Analysis of Variance Table
## 
## Model 1: MedianStartingSalaryOfAlumniThousands ~ SchoolType + Endowment2017Millions + 
##     Selectivity + Fall2017AcceptanceRate + MalePercentage
## Model 2: MedianStartingSalaryOfAlumniThousands ~ (TuitionFeesThousands + 
##     RoomBoardThousands + TotalEnrollment + SchoolType + YearFounded + 
##     Setting + Endowment2017Millions + Selectivity + Fall2017AcceptanceRate + 
##     MalePercentage + FourYearGraduationRate + TotalCostThousands) - 
##     TuitionFeesThousands - RoomBoardThousands
##   Res.Df    RSS Df Sum of Sq Pr(>Chi)
## 1    100 1348.5                      
## 2     93 1309.3  7     39.15   0.9045